## Clear R-workspace
rm(list=ls(all=TRUE))

## Close all graphic devices
graphics.off()
#####################
### Load packages ###
#####################
library(pacman)

pacman::p_load(extrafont,DT,stringr,dplyr,plyr,tibble,tidyverse,strex,data.table,entropy,vegan,ggstatsplot,hrbrthemes,ggstatsplot, immunarch,reshape, pander, data.table,ggrepel,scales, binfotron,DivE)

extrafont::loadfonts(device="win")
windowsFonts(sans="Palatino Linotype")
loadfonts(device="win")
loadfonts(device="postscript")
###########################
### Main analysis paths ###
###########################
rootDir <-'D:/BIOINF/PROJECTS/2_ANALYSES/18_GIT_PAPERS/TCR_BCR_antiPD1/'

# Main
scriptsPath <- paste0("scripts/")
scriptsMainPath<- paste0(scriptsPath, "main/")
scriptsFunctionsPath <- paste0(scriptsPath,"functions/")
projectDataPath <- paste0("data/")
# Input
tcgaInputData <- paste0(projectDataPath,"TCGA/")
antiPDL1publicInputData <- paste0(projectDataPath,"antiPD(L)1_public/")
otherInputData <- paste0(projectDataPath,"other/")
referenceInputData <- paste0(projectDataPath,"reference/")
# Output
projectOutDataPath <- paste0("output/data_files/")
# tcgaIntermediateData <- paste0(projectOutDataPath,"TCGA/")
antiPDL1publicIntermediateData <- paste0(projectOutDataPath,"antiPD(L)1_public/")
# MixCR output paths
pathToMixcr <-"rna/processed/mixcr/"
pathToMixcrFilter <- "rna/processed/mixcr_tcr/"
pathToKallisto<- "rna/processed/kallisto/"
pathToSampleTables <- "metadata/"

# Session/dependencies
sessionInfoPath <- paste0("session_info/")

######################################
## Create intermediate output paths ##
######################################
# if (!dir.exists(paste0(antiPDL1publicIntermediateData,"purity/"))) {dir.create(paste0(antiPDL1publicIntermediateData,"purity/"))}


##################################
### LOAD SOURCE FUNCTIONS FILE ###
##################################
source(paste0(scriptsFunctionsPath,"aPD1_TCR_BCR_clones_analysis_functions.R"))

##########################
### FOLDER/FILES NAMES ###
##########################
# Create list with folders names for the different datasets.
datasets <- c("Hugo","Riaz","Gide","Liu","Rod","Miao","Immotion150","Kim","Mari", "Powles","Braun")
data.foldersList <- as.list(c("Hugo_MEL","Riaz_MEL","Gide_MEL", "Liu_MEL","Rodriguez_MEL","Miao_RCC","Immo150_RCC","Kim_GAS","Mari_BLA","Powles_BLA","Braun_RCC"))
names(data.foldersList)<- c("Hugo","Riaz","Gide", "Liu","Rod", "Miao","Immotion150","Kim","Mari","Powles","Braun")

############################
## Project Data Filenames ##
############################
vst.batch.dataset.RData <- "RNASeqClin_VSTbatchDataset.RData"
fpkm.uq.RData <- "RNASeqClin_FPKM.UQ.RData"


#####################
### File suffixes ###
#####################
Rdata.suffix <- ".RData"

1 Load data

Loading basic data objects and files for analysis.

1.1 Gene expression signatures

TIS-GEP and TuTack

1.2 Sample and CIBERSORT data

datatable(samples_all.merged %>% select(run_accession,dataset) %>% group_by(dataset) %>% distinct() %>% dplyr::summarize(n()), extensions = 'Buttons', options = list(
    dom = 'Bfrtip',
    buttons = c('copy', 'excel', 'csv' ),
    scrollX=TRUE,
    pageLength=15
  ),
  caption = 'Number of patients in all datasets'
)

2 Calculate TuTack, GEP and extract PD-L1 expression

Merge with rest of sample metadata.

3 Loading MiXCR data-VERSION 2[immunarch]: filtering & downsampling

As a first step, we are using the immunarch package to load immune repertoire files into R workspace. After immunarch data are loaded, we merge them with the rest of the samples metadata information.

Check if all samples are there.

I looked into this sample from Riaz dataset (SRR5088850), and no clonotypes were found, neither TCR nor Ig.

NOTE: remember to filter for samples included in the analysis, since we are loading all available data, and some are excluded (on-treatment samples etc.)

3.1 Write new clonotype files with only TCR quantified CDR3 sequences, or only BCR quantified CDR3 sequences

Write new clonotype files with only TCR quantifies CDR3 sequences or only BCR quantified CDR3 sequences, to be further processed with VDJtools DownSample, setting a minimum of 20/50/100 clone reads in each sample. The same downsampling is applied across all samples and datasets, since we would also like to merge data across tissues, consistent downsampling is important.

We are not taking the immunarch loaded data, I rather load the initial tables, process them and write a new file for each sample of each dataset.

NOTE: This is only used to explore thresholds, sample clone reads, not for downstream analysis.

3.2 Clone Reads of TCR (alpha, beta, gamma, delta) & BCR-IGH CDR3 sequences

To decide the downsampling I am counting the clone counts in all the files, and then check how many files per dataset will be possibly removed with the downsampling threshold applied.

3.2.1 TCR reads

NUMBER OF SAMPLES WITH TCR CLONE READS ABOVE ZERO


NUMBER OF SAMPLES WITH TCR CLONE READS EQUAL TO ZERO


SAMPLES WITH TCR CLONE READS EQUAL TO ZERO


NUMBER OF SAMPLES WITH TCR CLONE READS EQUAL TO ONE


NUMBER OF SAMPLES WITH TCR CLONE READS ABOVE 50


Summarizing table

3.2.2 BCR-IGH reads

NUMBER OF SAMPLES WITH BCR-IGH CLONE READS ABOVE ZERO


NUMBER OF SAMPLES WITH BCR-IGH CLONE READS EQUAL TO ZERO


SAMPLES WITH BCR-IGH CLONE READS EQUAL TO ZERO


NUMBER OF SAMPLES WITH BCR-IGH CLONE READS EQUAL TO ONE


NUMBER OF SAMPLES WITH BCR-IGH CLONE READS ABOVE 50


Summarizing table

4 Data processing with immunarch - TCR & BCR-IGH only

4.1 Filtering for TCR & BCR-IGH CDR3 sequences only

Now we are filtering out immunoglobulin information, to only keep TCR CDR3 sequences. And filtering out TCR CDR3 sequences to keep IGH CDR3 sequences.

Checκ if all samples are present.

4.1.1 Missing samples

We are also filtering out all samples not included in the analysis.

Many of the Liu samples are empty DFs, because no TCR CDR3 sequences were found.

Try:

  1. No downsampling
  • Filter functional / non-functional / in-frame / out-of-frame clonotypes
  • calculate repertoire diversity measures
    • chao1: a nonparameteric asymptotic estimator of species richness (number of species in a population)
    • shannon entropy
  • visualize for response
  1. Try downsampling
  • Downsample

  • Filter functional / non-functional / in-frame / out-of-frame clonotypes

  • calculate repertoire diversity measures

    • chao1: a nonparameteric asymptotic estimator of species richness (number of species in a population)
    • shannon entropy
  • visualize for response

4.2 Functional CDR3 and downsampling

immunarch package gives an error when trying to downsample to read clones in samples that have less reads than the downsampling threshold. So what we do is filter out beforehand all samples that have less clone reads than what is set. The reasoning behind this is that for samples that we have less read clones we basically cannot judge about clonality and richness, we consider this as a technical issue related to extracting TCR sequences from bulk RNA-Seq data, usually resulting in very low tCR seq abundances, from which we cannot make any diversity estimations.

Up until now we have five lists of data:

  • all TCR chains clones
  • TCRa clones
  • TCRb clones
  • TCRg clones
  • TCRd clones
  • IGH clones

In the downstream analysis from here on, we will continue with mainly alpha and beta CDR3 sequences, but try to process the others as well, in case we need them.

4.2.1 Filter out samples with low TCR CD3 reads

Melanoma datasets seem to have higher clone reads compared to RCC and most particularly comparing against bladder. Both Mariathasan and Powles have very low TCRbeta CDR3 clone reads.

Check figures below where for every sample, total TCR (all chains) reads and TCRbeta, TCRalpha reads are presented.

4.2.2 Only IGH & TCR clones

rplot <-sapply(datasets[1:10], function(x) plot_CountCloneReads(cloneReads.df,"TCR Clone reads","Clone Reads","dataset",x,yInter = 10,printLabels=FALSE,transformY=FALSE), simplify = FALSE)

4.2.2.1 Hugo

4.2.2.2 Riaz

4.2.2.3 Gide

4.2.2.4 Liu

4.2.2.5 Rod

4.2.2.6 Miao

4.2.2.7 Immotion150

4.2.2.8 Kim

4.2.2.9 Mari

4.2.2.10 Powles

4.2.3 All Ig & TCR clones

For a comparison with the full exported clones from mixcr we also calculate the clone reads for full TCR, Ig repertoire, and see how many samples pass the clone read threshold of 100.

rplotALL <-sapply(datasets[1:10], function(x) plot_CountCloneReads(cloneReads_all.df,"ALL Clone reads","Clone Reads","dataset",x,yInter = 100,printLabels=FALSE,transformY=TRUE), simplify = FALSE)

Merge all of them and see:

cloneReads_all.df.merged <- merge(cloneReads_all.df,cloneReads.df, by=c("sampleid", "dataset","title", "response"), all=TRUE)
colnames(cloneReads_all.df.merged )[5:6] <-c("CloneReads.ALL","CloneReads.TCR")
rplotALL <-sapply(datasets[1:10], function(x) plot_CountCloneReads(cloneReads_all.df.merged,"ALL Clone reads","Clone Reads","dataset",x,yInter = 100,printLabels=FALSE,transformY=TRUE), simplify = FALSE)

4.2.3.1 Hugo

4.2.3.2 Riaz

4.2.3.3 Gide

4.2.3.4 Liu

4.2.3.5 Rod

4.2.3.6 Miao

4.2.3.7 Immotion150

4.2.3.8 Kim

4.2.3.9 Mari

4.2.3.10 Powles

We can see that by taking the full Ig and ΤCR repertoire, there are a few samples who have less CDR3 sequencing reads than 100. We consider all samples passing this threshold to have clonotypes we can trust, so these are samples we trust, the rest sampels are excluded. Based on this, the downsampling threshold we set when selecting for TCR clones, and specifically, when downsampling TCRalpha chain or TCRbeta chain CDR3 sequences will be much less, for now 4 reads.

First finding which samples have very few reads. I am calculating for the following thresholds: 1. 0 reads 2. 1 reads 3. 4 reads 4. 10 reads 5. 20 reads 6. 50 reads

We continue with keeping samples that have >=4 clone reads.

5 Processing: functional clonotypes, downsampling, diversity estimation

Using functionalities provided by the immunarch package, we calculate the following diversity metrics:

  • Chao 1
  • Hill
  • d50
  • d30
  • Ecological diversity div
  • div30

5.1 Diversity metrics with immunarch

Now we make use of the immunarch package functionalities to calculate several diversity measures/indices. From the immunarch site:

Several approaches to the estimation of repertoire diversity are implemented in the repDiversity function. The .method parameter similarly to abovementionned functions sets the means for diversity estimation. You can choose one of the following methods:

  • chao1 - Chao1 estimator is a nonparameteric asymptotic estimator of species richness (number of species in a population).: chao1 returns 4 values: estimated number of species, standart deviation of this number and two 95. Chao1 is a nonparametric method for estimating the number of species in a community. The Chao richness estimator was developed by Anne Chao and is based on the concept that rare species infer the most information about the number of missing species

  • hill - Hill numbers are a mathematically unified family of diversity indices (differing only by an exponent q).

  • div - True diversity, or the effective number of types, refers to the number of equally-abundant types needed for the average proportional abundance of the types to equal that observed in the dataset of interest where all types may not be equally abundant.

  • gini.simp - The Gini-Simpson index is the probability of interspecific encounter, i.e., probability that two entities represent different types.

  • inv.simp - Inverse Simpson index is the effective number of types that is obtained when the weighted arithmetic mean is used to quantify average proportional abundance of types in the dataset of interest.

  • gini - The Gini coefficient measures the inequality among values of a frequency distribution (for example levels of income). A Gini coefficient of zero expresses perfect equality, where all values are the same (for example, where everyone has the same income). A Gini coefficient of one (or 100 percents ) expresses maximal inequality among values (for example where only one person has all the income).

  • raref - Rarefaction is a technique to assess species richness from the results of sampling through extrapolation.** PACKAGE GIVES ERROR**

  • d50 is a recently developed immune diversity estimate. It calculates the minimum number of distinct clonotypes amounting to greater than or equal to 50 percent of a total of sequencing reads obtained following amplification and sequencing

  • dXX is a similar to d50 index where XX corresponds to desirable percent of total sequencing reads

The .col parameter regulates what sequences and gene segments to choose, we choose to estimate diversity on the aminoacid level.

5.1.1 TCR alpha - downsampled

Visualizing the diversity estimations for each dataset

plotDivs <-sapply(datasets[1:10], function(x) diversityEst_vis(x,immdata.apd1.proc.TcrA.divEst[[x]]), simplify=FALSE)
5.1.1.0.1 Hugo

5.1.1.0.2 Riaz

5.1.1.0.3 Gide

5.1.1.0.4 Liu

5.1.1.0.5 Rod

5.1.1.0.6 Miao

5.1.1.0.7 Immotion150

5.1.1.0.8 Kim

5.1.1.0.9 Mari

5.1.1.0.10 Powles

5.1.2 TCR beta - downsampled

Visualizing the diversity estimations for each dataset

plotDivs <-sapply(datasets[1:10], function(x) diversityEst_vis(x,immdata.apd1.proc.TcrB.divEst[[x]]), simplify=FALSE)
5.1.2.0.1 Hugo

5.1.2.0.2 Riaz

5.1.2.0.3 Gide

5.1.2.0.4 Liu

5.1.2.0.5 Rod

5.1.2.0.6 Miao

5.1.2.0.7 Immotion150

5.1.2.0.8 Kim

5.1.2.0.9 Mari

5.1.2.0.10 Powles

5.1.3 IGH - downsampled

Visualizing the diversity estimations for each dataset

plotDivs <-sapply(datasets[1:10], function(x) diversityEst_vis(x,immdata.apd1.proc.igH.divEst[[x]]), simplify=FALSE)
5.1.3.0.1 Hugo

5.1.3.0.2 Riaz

5.1.3.0.3 Gide

5.1.3.0.4 Liu

5.1.3.0.5 Rod

5.1.3.0.6 Miao

5.1.3.0.7 Immotion150

5.1.3.0.8 Kim

5.1.3.0.9 Mari

5.1.3.0.10 Powles

5.1.4 TCR alpha - NOT downsampled

Visualizing the diversity estimations for each dataset

plotDivs <-sapply(datasets[1:10], function(x) diversityEst_vis(x,immdata.apd1.proc.TcrA.divEst.noDS[[x]]), simplify=FALSE)
5.1.4.0.1 Hugo

5.1.4.0.2 Riaz

5.1.4.0.3 Gide

5.1.4.0.4 Liu

5.1.4.0.5 Rod

5.1.4.0.6 Miao

5.1.4.0.7 Immotion150

5.1.4.0.8 Kim

5.1.4.0.9 Mari

5.1.4.0.10 Powles

5.1.5 TCR beta - NOT downsampled

Visualizing the diversity estimations for each dataset

plotDivs <-sapply(datasets[1:10], function(x) diversityEst_vis(x,immdata.apd1.proc.TcrB.divEst.noDS[[x]]), simplify=FALSE)

5.1.6 IGH beta - NOT downsampled

Visualizing the diversity estimations for each dataset

plotDivs <-sapply(datasets[1:10], function(x) diversityEst_vis(x,immdata.apd1.proc.igH.divEst.noDS[[x]]), simplify=FALSE)

5.2 Modeled entropy [binfotron package]

From the paper of Bortone et al.

Evenness was calculated as Shannon entropy / Log(Richness)(27). Evenness produces a comparable result to the “productive clonality” as reported by Adaptive Biotechnologies. Their “clonality” is 1 - evenness, although they use log base 2 in both their entropy and normalization (personal communication). d25, d50 and d75 (dXX) indices were calculated as described by VDJTools(28) GitHub repository: “The estimate equals to 1 - n / N, where n is the minimum number of clonotypes accounting for at least XX% of the total reads and code N is the total number of clonotypes in the sample.”

In the Model generation section:

Samples were balanced by the number of events for both the discovery-validation and test-training set splits. For the calculation of modeled entropy, analyzed samples were required to have an abundance of 8 or higher for both  and  TCR chains

Although it has been reported that evenness tends to overestimate diversity at lower abundances(41), we were surprised to see that this overestimation was much more pronounced in samples with lower initial diversity. Not only would this systematic overestimation of evenness obfuscate correlations with diversity, but in datasets where samples with higher true diversity also have higher abundance, this normalization could flip the direction of a correlation.

In the Results section: Regardless of the TCR chain or diversity index used, no measurements supported accurate estimation of true index values from RNA-seq simulated reads. An ideal diversity index should be stable across a range of abundance and entropy values. Surprisingly, even at the higher abundances of amplicon simulated reads, evenness still failed to track well with either the ground truth evenness or Shannon entropy.

In the Modeling entropy for low abundance samples section:

Given the stability of Shannon entropy as a diversity index across a wide range of parameters, we sought to derive a model to predict the true TCR repertoire entropy when abundances are low (i.e., in conditions of undersampling such as tumor RNA sequencing experiments).

Here we determined the abundance needed at a measured Shannon entropy to be within 5% of the ground truth Shannon entropy. If a sample’s measured Shannon entropy and abundance put the sample over this threshold the measured Shannon entropy was taken prima facie and no correction was made for our model entropy calculation

A robust estimate for diversity should also be more internally consistent, providing similar predictions across a range of TCR abundances.

Together, these results demonstrated that the model-based diversity estimates showedimproved correlations with true biological diversity and better internal consistency.

In the Predicting patient outcomes using TCR modeled entropy section:

With model-based corrected diversity providing better correlations with biological diversity than other metrics, we then asked: Does this improved metric of diversity improve prediction of survival of cancer patients? We note that even a perfect measurement of diversity may not predict survival in cancer.

Repertoire inference from RNA-seq data is untrustworthy whendegraded RNA derived from FFPE tissue is used as template for sequencing library preparation(46). IncreasedTCR repertoire diversity was associated with improved survival in melanoma patients treated with nivolumab.

Only the significance of TCR for all TCGAsamples was revealed if evenness was used as the diversity normalization

In the DISCUSSION section:

TCR repertoire diversity reflects the degree of clonal expansion in a T cell population and thus is a valuable biological readout as well as a potential biomarker of health or disease. Here, we showed that accurate measurement of TCR repertoire diversity required correction for undersampling that may occur as a result of biological limitations, such as testing only a small portion of a blood or tissue-resident T cell population of interest, or technical limitations such as low sequencing read coverage for a given sample. Optimal estimation of true population diversity also required use of the most accurate algorithm for TCR repertoire inference.

existing methods of calculating diversity from these repertoire counts were inaccurate under typical RNA-seq conditions. We developed a method for correcting diversity estimates that can be applied to any dataset using software supplied with this article. Of note, models built from TCR alpha data performed well when applied to TCR beta datasets and vice versa. Even “chain agnostic” models trained on subsampled population counts performed well on TCR data, suggesting that our method may be generally extensible to entropy estimation in the context of undersampling beyond TCR repertoire data as long as the expected diversity of the sampled population is within the range provided to the model. In datasets where sample entropy estimates fall outside the range of entropies upon which the model was trained (0.98 to 8.11), we recommend training a new entropy correction model on the desired true Shannon entropy range.

While our method of modeling the true Shannon entropy from RNA-seq data should effectively correct entropy measurements with different abundances, it should only be used to correct entropy within a batch of samples.

NOTE: I need to contact the authors to understand the above.

Given the differences in sensitivity across sample preparations and sequencing methods, it would be very difficult to make a generalizable tool that corrects across sequencing methods. For these high-quality high-abundance sequencing methods, we recommend using Shannon entropy as this metric approaches the ground truth Shannon entropy when the number of reads is in the tens of thousands range or higher. The use of evenness and “productive clonality” (1-evenness) is strongly discouraged.

Additionally, there is emerging evidence that immune checkpoint inhibition efficacy is driven by clonal expansion that happens after initiation of therapy(15,47,48). Thus, we expect that more accurate measurement of TCR repertoire diversity, as well as measurements of repertoire diversity while on therapy, will provide better biomarkers of response. We also expect that the magnitude and direction of associations between TCR repertoire diversity and clinical outcomes will vary by therapeutic mechanism of action. If, as our results suggest, clonal expansion occurs after administering immune checkpoint blockade, then monitoring the changes in diversity upon receiving ICP therapy could be an early indicator of how well a patient is responding to immunotherapy. Our method of accurately estimating diversity even with low counts may make such diagnostics readily affordable and aid in numerous fields that must rely upon assessing diversity from subsampled populations.

5.2.1 TCRA: Plots of modeled entropy

# TRA
#ds
modeledEntropy.tcrA.ds <-sapply(datasets[1:10], function(x) immdata.apd1.proc.TcrA.divEst[[x]]$diversityEst$modEntr.ds, simplify = FALSE)
modeledEntropy.tcrA.ds.df <-ldply(modeledEntropy.tcrA.ds)
colnames(modeledEntropy.tcrA.ds.df)[1]<- "dataset"

#no ds
modeledEntropy.tcrA.nods <-sapply(datasets[1:10], function(x) immdata.apd1.proc.TcrA.divEst.noDS[[x]]$diversityEst$modEntr.nods, simplify = FALSE)
modeledEntropy.tcrA.nods.df <-ldply(modeledEntropy.tcrA.nods)
colnames(modeledEntropy.tcrA.nods.df)[1]<- "dataset"

# Merge
modeledEntropy.tcrA.df <- merge(modeledEntropy.tcrA.ds.df,modeledEntropy.tcrA.nods.df, by=c("dataset","sampleid"), all=TRUE)
# Merge with sample data
modeledEntropy.tcrA.df <- merge(modeledEntropy.tcrA.df, samples_all.merged.fix[,c(1,2,10)], by.x = "sampleid", by.y = "run_accession", all.x=TRUE)

# TRB
#ds
modeledEntropy.tcrB.ds <-sapply(datasets[1:10], function(x) immdata.apd1.proc.TcrB.divEst[[x]]$diversityEst$modEntr.ds, simplify = FALSE)
modeledEntropy.tcrB.ds.df <-ldply(modeledEntropy.tcrB.ds)
colnames(modeledEntropy.tcrB.ds.df)[1]<- "dataset"

#no ds
modeledEntropy.tcrB.nods <-sapply(datasets[1:10], function(x) immdata.apd1.proc.TcrB.divEst.noDS[[x]]$diversityEst$modEntr.nods, simplify = FALSE)
modeledEntropy.tcrB.nods.df <-ldply(modeledEntropy.tcrB.nods)
colnames(modeledEntropy.tcrB.nods.df)[1]<- "dataset"

# Merge
modeledEntropy.tcrB.df <- merge(modeledEntropy.tcrB.ds.df,modeledEntropy.tcrB.nods.df, by=c("dataset","sampleid"), all=TRUE)
# Merge with sample data
modeledEntropy.tcrB.df <- merge(modeledEntropy.tcrB.df, samples_all.merged.fix[,c(1,2,10)], by.x = "sampleid", by.y = "run_accession", all.x=TRUE)

# IGH
#ds
modeledEntropy.igH.ds <-sapply(datasets[1:10], function(x) immdata.apd1.proc.igH.divEst[[x]]$diversityEst$modEntr.ds, simplify = FALSE)
modeledEntropy.igH.ds.df <-ldply(modeledEntropy.igH.ds)
colnames(modeledEntropy.igH.ds.df)[1]<- "dataset"

#no ds
modeledEntropy.igH.nods <-sapply(datasets[1:10], function(x) immdata.apd1.proc.igH.divEst.noDS[[x]]$diversityEst$modEntr.nods, simplify = FALSE)
modeledEntropy.igH.nods.df <-ldply(modeledEntropy.igH.nods)
colnames(modeledEntropy.igH.nods.df)[1]<- "dataset"

# Merge
modeledEntropy.igH.df <- merge(modeledEntropy.igH.ds.df,modeledEntropy.igH.nods.df, by=c("dataset","sampleid"), all=TRUE)
# Merge with sample data
modeledEntropy.igH.df <- merge(modeledEntropy.igH.df, samples_all.merged.fix[,c(1,2,10)], by.x = "sampleid", by.y = "run_accession", all.x=TRUE)
# TRA
rplotME <-sapply(datasets[1:10], function(x) plot_CountCloneReads(modeledEntropy.tcrA.df,"Modeled entropy - TCR alpha","Modeled Entropy","dataset",x,yInter = 0,printLabels=FALSE,transformY=FALSE), simplify = FALSE)

5.2.1.1 Hugo

5.2.1.2 Riaz

5.2.1.3 Gide

5.2.1.4 Liu

5.2.1.5 Rod

5.2.1.6 Miao

5.2.1.7 Immotion150

5.2.1.8 Kim

5.2.1.9 Mari

5.2.1.10 Powles

5.2.2 TCRB :Plots of modeled entropy

# TRB
rplotME <-sapply(datasets[1:10], function(x) plot_CountCloneReads(modeledEntropy.tcrB.df,"Modeled entropy - TCR alpha","Modeled Entropy","dataset",x,yInter = 0,printLabels=FALSE,transformY=FALSE), simplify = FALSE)

5.2.2.1 Hugo

5.2.2.2 Riaz

5.2.2.3 Gide

5.2.2.4 Liu

5.2.2.5 Rod

5.2.2.6 Miao

5.2.2.7 Immotion150

5.2.2.8 Kim

5.2.2.9 Mari

5.2.2.10 Powles

5.2.3 IGH :Plots of modeled entropy

# TRB
rplotME <-sapply(datasets[1:10], function(x) plot_CountCloneReads(modeledEntropy.igH.df,"Modeled entropy - BCR:IGH","Modeled Entropy","dataset",x,yInter = 0,printLabels=FALSE,transformY=FALSE), simplify = FALSE)

5.2.3.1 Hugo

5.2.3.2 Riaz

5.2.3.3 Gide

5.2.3.4 Liu

5.2.3.5 Rod

5.2.3.6 Miao

5.2.3.7 Immotion150

5.2.3.8 Kim

5.2.3.9 Mari

5.2.3.10 Powles

5.3 TCR richness - basic manual calculation

Calculate TCR richness (based on TCRa and TCRb chains separately) in non-downsampled and downsampled data.

5.3.1 TCRA: Plots of richness

# TRA
#ds
richness.tcrA.ds <-sapply(datasets[1:10], function(x) immdata.apd1.proc.TcrA.divEst[[x]]$diversityEst$TCRArichness.ds, simplify = FALSE)
richness.tcrA.ds.df <-ldply(richness.tcrA.ds)
colnames(richness.tcrA.ds.df)[1]<- "dataset"

#no ds
richness.tcrA.nods <-sapply(datasets[1:10], function(x) immdata.apd1.proc.TcrA.divEst.noDS[[x]]$diversityEst$TCRArichness.nods, simplify = FALSE)
richness.tcrA.nods.df <-ldply(richness.tcrA.nods)
colnames(richness.tcrA.nods.df)[1]<- "dataset"

# Merge
richness.tcrA.df <- merge(richness.tcrA.ds.df,richness.tcrA.nods.df, by=c("dataset","sampleid"), all=TRUE)
# Merge with sample data
richness.tcrA.df <- merge(richness.tcrA.df, samples_all.merged.fix[,c(1,2,10)], by.x = "sampleid", by.y = "run_accession", all.x=TRUE)

# TRB
#ds
richness.tcrB.ds <-sapply(datasets[1:10], function(x) immdata.apd1.proc.TcrB.divEst[[x]]$diversityEst$TCRBrichness.ds, simplify = FALSE)
richness.tcrB.ds.df <-ldply(richness.tcrB.ds)
colnames(richness.tcrB.ds.df)[1]<- "dataset"

#no ds
richness.tcrB.nods <-sapply(datasets[1:10], function(x) immdata.apd1.proc.TcrB.divEst.noDS[[x]]$diversityEst$TCRBrichness.nods, simplify = FALSE)
richness.tcrB.nods.df <-ldply(richness.tcrB.nods)
colnames(richness.tcrB.nods.df)[1]<- "dataset"

# Merge
richness.tcrB.df <- merge(richness.tcrB.ds.df,richness.tcrB.nods.df, by=c("dataset","sampleid"), all=TRUE)
# Merge with sample data
richness.tcrB.df <- merge(richness.tcrB.df, samples_all.merged.fix[,c(1,2,10)], by.x = "sampleid", by.y = "run_accession", all.x=TRUE)

# IGH
#ds
richness.igH.ds <-sapply(datasets[1:10], function(x) immdata.apd1.proc.igH.divEst[[x]]$diversityEst$BCR.IGHrichness.ds, simplify = FALSE)
richness.igH.ds.df <-ldply(richness.igH.ds)
colnames(richness.igH.ds.df)[1]<- "dataset"

#no ds
richness.igH.nods <-sapply(datasets[1:10], function(x) immdata.apd1.proc.igH.divEst.noDS[[x]]$diversityEst$BCR.IGHrichness.nods, simplify = FALSE)
richness.igH.nods.df <-ldply(richness.igH.nods)
colnames(richness.igH.nods.df)[1]<- "dataset"

# Merge
richness.igH.df <- merge(richness.igH.ds.df,richness.igH.nods.df, by=c("dataset","sampleid"), all=TRUE)
# Merge with sample data
richness.igH.df <- merge(richness.igH.df, samples_all.merged.fix[,c(1,2,10)], by.x = "sampleid", by.y = "run_accession", all.x=TRUE)
# TRA
rplotME <-sapply(datasets[1:10], function(x) plot_CountCloneReads(richness.tcrA.df,"Richness - TCR alpha","Richness","dataset",x,yInter = 0,printLabels=FALSE,transformY=FALSE), simplify = FALSE)

5.3.1.1 Hugo

5.3.1.2 Riaz

5.3.1.3 Gide

5.3.1.4 Liu

5.3.1.5 Rod

5.3.1.6 Miao

5.3.1.7 Immotion150

5.3.1.8 Kim

5.3.1.9 Mari

5.3.1.10 Powles

5.3.2 TCRB: Plots of richness

# TRB
rplotME <-sapply(datasets[1:10], function(x) plot_CountCloneReads(richness.tcrB.df,"Richness - TCR alpha","Richness","dataset",x,yInter = 0,printLabels=FALSE,transformY=FALSE), simplify = FALSE)

5.3.2.1 Hugo

5.3.2.2 Riaz

5.3.2.3 Gide

5.3.2.4 Liu

5.3.2.5 Rod

5.3.2.6 Miao

5.3.2.7 Immotion150

5.3.2.8 Kim

5.3.2.9 Mari

5.3.2.10 Powles

5.3.3 BCR-IGH: Plots of richness

# TRB
rplotME <-sapply(datasets[1:10], function(x) plot_CountCloneReads(richness.igH.df,"Richness - BCR-IGH","Richness","dataset",x,yInter = 0,printLabels=FALSE,transformY=FALSE), simplify = FALSE)

5.3.3.1 Hugo

5.3.3.2 Riaz

5.3.3.3 Gide

5.3.3.4 Liu

5.3.3.5 Rod

5.3.3.6 Miao

5.3.3.7 Immotion150

5.3.3.8 Kim

5.3.3.9 Mari

5.3.3.10 Powles

5.4 Comments on downsampled vs downsampled

We have not performed proper downsampling, to a specific number of clones, due technical difficulties arising from extracting TCR and Ig clones from bulk RNA-Seq data (non enriched libraries to begin with). The strategy we followed so far, consists of keeping samples with more than 4 clones (more than 4 TCRalpha, more thatn 4 TCRbeta, more than 4 IGH etc.). So when we compare the richness or modeled entropy between downsampled and non-downsampled data, we see no difference, since for the samples kept, the clone information, clone counts, clones kept, etc have NOT changed.

6 Boxplots of diversity metrics

7 Immunarch data=minimum 4 clones

7.1 TCRA

7.1.1 Modeled Entropy

compareBox(immdata.apd1.proc.TcrA.divEst.fin, "response_gender","modEntr.ds",groupVar=NULL, facetVar ="dataset","","response_gender", "Modeled TCR alpha entropy")

compareBox(immdata.apd1.proc.TcrA.divEst.fin, "response","modEntr.ds",groupVar=NULL, facetVar ="tissue","","response", "Modeled TCR alpha entropy")

compareBox(immdata.apd1.proc.TcrA.divEst.fin, "response_gender","modEntr.ds",groupVar=NULL, facetVar ="tissue","","response_gender", "Modeled TCR alpha entropy")

immdata.apd1.proc.TcrA.divEst.fin$group <- paste(immdata.apd1.proc.TcrA.divEst.fin$response,"-",immdata.apd1.proc.TcrA.divEst.fin$gender)
compareBox(immdata.apd1.proc.TcrA.divEst.fin, "group","modEntr.ds",groupVar=NULL, facetVar ="dataset","","response", "Modeled TCR alpha entropy")


compareBox(immdata.apd1.proc.TcrA.divEst.fin, "response","modEntr.ds",groupVar=NULL, facetVar ="dataset","","response", "Modeled TCR alpha entropy")

7.1.2 TCR alpha richness

compareBox(immdata.apd1.proc.TcrA.divEst.fin, "response","TCRArichness.ds",groupVar=NULL, facetVar ="dataset","","response", "TCR alpha richness")

compareBox(immdata.apd1.proc.TcrA.divEst.fin, "response","TCRArichness.ds",groupVar=NULL, facetVar ="tissue","","response", "TCR alpha richness")

compareBox(immdata.apd1.proc.TcrA.divEst.fin, "response_gender","TCRArichness.ds",groupVar=NULL, facetVar ="tissue","","response_gender", "TCR alpha richness")

7.2 TCRB

7.2.1 Modeled Entropy

compareBox(immdata.apd1.proc.TcrB.divEst.fin, "response","modEntr.ds",groupVar=NULL, facetVar ="dataset","","response", "Modeled TCR beta entropy")

compareBox(immdata.apd1.proc.TcrB.divEst.fin, "response","modEntr.ds",groupVar=NULL, facetVar ="tissue","","response", "Modeled TCR beta entropy")

7.2.2 TCR beta richness

compareBox(immdata.apd1.proc.TcrB.divEst.fin, "response","TCRBrichness.ds",groupVar=NULL, facetVar ="dataset","","response", "TCR beta richness")

compareBox(immdata.apd1.proc.TcrB.divEst.fin, "response","TCRBrichness.ds",groupVar=NULL, facetVar ="tissue","","response", "TCR beta richness")

7.3 BCR-IGH

7.3.1 Modeled Entropy

compareBox(immdata.apd1.proc.igH.divEst.fin, "response","modEntr.ds",groupVar=NULL, facetVar ="dataset","","response", "Modeled BCR:IGH entropy")

compareBox(immdata.apd1.proc.igH.divEst.fin, "response","modEntr.ds",groupVar=NULL, facetVar ="tissue","","response", "Modeled BCR:IGH entropy")

7.3.2 BCR:IGH richness

compareBox(immdata.apd1.proc.igH.divEst.fin, "response","BCR.IGHrichness.ds",groupVar=NULL, facetVar ="dataset","","response", "BCR:IGH richness")

compareBox(immdata.apd1.proc.igH.divEst.fin, "response","BCR.IGHrichness.ds",groupVar=NULL, facetVar ="tissue","","response", "BCR:IGH richness")

7.3.3 BCR:IGH True Diversity

compareBox(immdata.apd1.proc.igH.divEst.fin, "response","Value.trueDiv",groupVar=NULL, facetVar ="dataset","","response", "BCR:IGH richness")

compareBox(immdata.apd1.proc.igH.divEst.fin, "response","Value.trueDiv",groupVar=NULL, facetVar ="tissue","","response", "BCR:IGH richness")

8 Save data

9 Session

session_info <- sessionInfo()
writeLines(capture.output(session_info), paste0(sessionInfoPath,"B_02_aPD1_TCR_BCR_clones_analysis.txt"))

sessionInfo()
## R version 4.1.0 (2021-05-18)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 19045)
## 
## Matrix products: default
## 
## locale:
## [1] LC_COLLATE=English_United States.1252  LC_CTYPE=English_United States.1252    LC_MONETARY=English_United States.1252
## [4] LC_NUMERIC=C                           LC_TIME=English_United States.1252    
## 
## attached base packages:
##  [1] grid      parallel  stats4    stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] glmnet_4.1-4                Matrix_1.4-0                DivE_1.2                    rgeos_0.5-9                 sp_1.4-7                   
##  [6] FME_1.3.6.2                 coda_0.19-4                 rootSolve_1.8.2.3           deSolve_1.32                binfotron_0.5-01           
## [11] ggrepel_0.9.1               pander_0.6.5                reshape_0.8.9               immunarch_0.6.6             patchwork_1.1.2.9000       
## [16] dtplyr_1.2.1                vegan_2.6-2                 lattice_0.20-45             permute_0.9-7               entropy_1.3.1              
## [21] strex_1.4.2                 ggpubr_0.4.0                hrbrthemes_0.8.0            ggstatsplot_0.9.4.9000      estimate_1.0.13            
## [26] splitstackshape_1.4.8       randomcoloR_1.1.0.1         RColorBrewer_1.1-3          cluster_2.1.2               circlize_0.4.14            
## [31] scales_1.2.0                dichromat_2.0-0.1           ggthemes_4.2.4              data.table_1.14.2           ComplexHeatmap_2.10.0      
## [36] forcats_0.5.1               purrr_0.3.4                 tidyr_1.2.0                 ggplot2_3.3.6               tidyverse_1.3.1            
## [41] preprocessCore_1.56.0       e1071_1.7-9                 tibble_3.1.7                plyr_1.8.7                  stringr_1.4.0              
## [46] readr_2.1.2                 tximport_1.22.0             DESeq2_1.34.0               SummarizedExperiment_1.24.0 Biobase_2.54.0             
## [51] MatrixGenerics_1.6.0        matrixStats_0.62.0          GenomicRanges_1.46.1        GenomeInfoDb_1.30.1         IRanges_2.28.0             
## [56] S4Vectors_0.32.4            BiocGenerics_0.40.0         dplyr_1.0.9                 DT_0.22                     extrafont_0.18             
## [61] tictoc_1.1                  rmarkdown_2.14              pacman_0.5.1               
## 
## loaded via a namespace (and not attached):
##   [1] class_7.3-20           foreach_1.5.2          crayon_1.5.2           V8_4.2.2               MASS_7.3-57            nlme_3.1-157          
##   [7] backports_1.4.1        reprex_2.0.1           rlang_1.1.3            XVector_0.34.0         caret_6.0-92           readxl_1.4.0          
##  [13] performance_0.10.1.4   extrafontdb_1.0        limma_3.50.3           BiocParallel_1.28.3    rjson_0.2.21           bit64_4.0.5           
##  [19] glue_1.6.2             pheatmap_1.0.12        AnnotationDbi_1.56.2   UpSetR_1.4.0           haven_2.5.1            tidyselect_1.1.2      
##  [25] XML_3.99-0.9           zoo_1.8-10             xtable_1.8-4           magrittr_2.0.3         evaluate_0.18          gdtools_0.2.4         
##  [31] cli_3.6.2              zlibbioc_1.40.0        rstudioapi_0.13        bslib_0.3.1            rpart_4.1.16           shiny_1.7.1           
##  [37] xfun_0.35              clue_0.3-60            parameters_0.20.0.7    KEGGREST_1.34.0        listenv_0.8.0          Biostrings_2.62.0     
##  [43] png_0.1-7              future_1.26.1          ipred_0.9-13           zeallot_0.1.0          withr_2.5.0            bitops_1.0-7          
##  [49] cellranger_1.1.0       hardhat_1.2.0          pROC_1.18.0            pillar_1.8.0           GlobalOptions_0.1.2    cachem_1.0.6          
##  [55] multcomp_1.4-20        fs_1.5.2               flexmix_2.3-17         kernlab_0.9-30         GetoptLong_1.0.5       paletteer_1.4.0       
##  [61] vctrs_0.4.1            ellipsis_0.3.2         generics_0.1.3         lava_1.6.10            tools_4.1.0            munsell_0.5.0         
##  [67] emmeans_1.7.4-1        proxy_0.4-26           DelayedArray_0.20.0    fastmap_1.1.0          compiler_4.1.0         abind_1.4-5           
##  [73] httpuv_1.6.5           GenomeInfoDbData_1.2.7 prodlim_2019.11.13     gridExtra_2.3          utf8_1.2.2             later_1.3.0           
##  [79] recipes_1.0.1          jsonlite_1.8.3         carData_3.0-5          estimability_1.3       genefilter_1.76.0      promises_1.2.0.1      
##  [85] car_3.1-1              doParallel_1.0.17      checkmate_2.1.0        sandwich_3.0-2         Rtsne_0.16             survival_3.3-1        
##  [91] yaml_2.3.6             systemfonts_1.0.4      prabclus_2.3-2         htmltools_0.5.3        minpack.lm_1.2-2       memoise_2.0.1         
##  [97] modeltools_0.2-23      locfit_1.5-9.5         digest_0.6.29          assertthat_0.2.1       mime_0.12              Rttf2pt1_1.3.10       
## [103] bayestestR_0.13.0.5    RSQLite_2.2.13         future.apply_1.9.0     blob_1.2.3             labeling_0.4.2         shinythemes_1.2.0     
## [109] splines_4.1.0          rematch2_2.1.2         fpc_2.2-9              RCurl_1.98-1.6         broom_1.0.1            hms_1.1.2             
## [115] modelr_0.1.8           colorspace_2.0-3       shape_1.4.6            nnet_7.3-17            sass_0.4.1             Rcpp_1.0.9            
## [121] mclust_5.4.9           mvtnorm_1.1-3          ggseqlogo_0.1          fansi_1.0.3            tzdb_0.3.0             parallelly_1.32.1     
## [127] ModelMetrics_1.2.2.2   R6_2.5.1               factoextra_1.0.7       lifecycle_1.0.1        statsExpressions_1.3.3 datawizard_0.6.5      
## [133] curl_5.2.0             ggsignif_0.6.3         minqa_1.2.5            jquerylib_0.1.4        robustbase_0.95-0      TH.data_1.1-1         
## [139] iterators_1.0.14       gower_1.0.0            htmlwidgets_1.5.4      crosstalk_1.2.0        rvest_1.0.2            mgcv_1.8-40           
## [145] globals_0.15.1         insight_0.18.8         codetools_0.2-18       lubridate_1.8.0        dbplyr_2.1.1           correlation_0.8.3     
## [151] gtable_0.3.1           DBI_1.1.2              ggalluvial_0.12.3      httr_1.4.7             highr_0.9              stringi_1.7.8         
## [157] vroom_1.6.0            reshape2_1.4.4         farver_2.1.1           diptest_0.76-0         annotate_1.72.0        magick_2.7.3          
## [163] timeDate_4021.104      xml2_1.3.3             geneplotter_1.72.0     DEoptimR_1.0-11        bit_4.0.5              pkgconfig_2.0.3       
## [169] rstatix_0.7.0          knitr_1.41